Load libraries

suppressMessages(library(tidyverse))
suppressMessages(library(downloader))
suppressMessages(library(plotly))
suppressMessages(library(car))
suppressMessages(library(dunn.test))
suppressMessages(library(broom))

1) Read in the gapminder_clean.csv data as a tibble using read_csv

dat <- read.csv('gapminder_clean.csv')
dim(dat)
## [1] 2607   20

The gapminder data consists of a matrix with 2607 rows and 20 columns.

2) Filter the data to include only rows where Year is 1962 and then make a scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap for the filtered data

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
  ggplot(aes(x = gdpPercap, y = CO2.emissions..metric.tons.per.capita.)) +
  geom_point() +
  ylab("CO2 emissions (metric tons per capita)")

# Because of one very large outlier the analysis repreated with log transformed data

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
  ggplot(aes(x = log(gdpPercap), y = log(CO2.emissions..metric.tons.per.capita.))) +
  geom_point() +
  ylab("log(CO2 emissions (metric tons per capita))")

Log transformed data can be used to make data as “normal” as possible so that the statistical analysis results become more valid.

3) On the filtered data, calculate the pearson correlation of ‘CO2 emissions (metric tons per capita)’ and gdpPercap. What is the Pearson R value and associated p value?

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
  pivot_longer( gdpPercap, names_to = "x_var", values_to = "x_val") %>%
  pivot_longer( CO2.emissions..metric.tons.per.capita., names_to = "y_var", values_to = "y_val") %>%
  group_by(x_var, y_var) %>%
   summarise(cor_coef = cor.test(x_val, y_val)$estimate,
            p_val = cor.test(x_val, y_val)$p.value)
## `summarise()` has grouped output by 'x_var'. You can override using the `.groups` argument.
## # A tibble: 1 x 4
## # Groups:   x_var [1]
##   x_var     y_var                                  cor_coef    p_val
##   <chr>     <chr>                                     <dbl>    <dbl>
## 1 gdpPercap CO2.emissions..metric.tons.per.capita.    0.926 1.13e-46

The Pearson R value of ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.9260817 and the associated p value is 1.128679e-46.

# Because of one very large outlier the analysis repreated with log transformed data

dat %>% 
  filter(Year == 1962) %>%
  # remove rows which contain NAs
  filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
  pivot_longer( gdpPercap, names_to = "x_var", values_to = "x_val") %>%
  pivot_longer( CO2.emissions..metric.tons.per.capita., names_to = "y_var", values_to = "y_val") %>%
  group_by(x_var, y_var) %>%
  # calculate pearson R value and associated p value
  summarise(cor_coef = cor.test(log(x_val), log(y_val))$estimate,
            p_val = cor.test(log(x_val), log(y_val))$p.value)
## `summarise()` has grouped output by 'x_var'. You can override using the `.groups` argument.
## # A tibble: 1 x 4
## # Groups:   x_var [1]
##   x_var     y_var                                  cor_coef    p_val
##   <chr>     <chr>                                     <dbl>    <dbl>
## 1 gdpPercap CO2.emissions..metric.tons.per.capita.    0.860 8.90e-33

The Pearson R value of log transformed ‘CO2 emissions (metric tons per capita)’ and gdpPercap in Year 1962 is 0.8602081 and the associated p value is 8.903567e-33.

4) On the unfiltered data, answer “In what year is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest?” Filter the dataset to that year for the next step…

year.cor.max <- dat %>%
  # remove rows which contain NAs
  filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
  # split origial tibble in lists depending on the year
  split(.$Year) %>%
  # perform pearson correlation test on each year list, keep only the pearson R value
  map(~cor.test(log(.$CO2.emissions..metric.tons.per.capita.), log(.$gdpPercap))$estimate) %>% 
  # merge list of lists to one final list
  unlist

year.cor.max
##  1962.cor  1967.cor  1972.cor  1977.cor  1982.cor  1987.cor  1992.cor  1997.cor 
## 0.8602081 0.8736773 0.8843802 0.9014278 0.9151610 0.9080487 0.8934602 0.9208183 
##  2002.cor  2007.cor 
## 0.9300017 0.9275425
year.cor.max[which.max(year.cor.max)]
##  2002.cor 
## 0.9300017

In year 2002 is the correlation between ‘CO2 emissions (metric tons per capita)’ and gdpPercap the strongest.

5) Using plotly, create an interactive scatter plot comparing ‘CO2 emissions (metric tons per capita)’ and gdpPercap, where the point size is determined by pop (population) and the color is determined by the continent. You can easily convert any ggplot plot to a plotly plot using the ggplotly() command.

viz <- dat %>% 
    filter(Year == 2002) %>%
    # remove rows which contain NAs
    filter(across(c(CO2.emissions..metric.tons.per.capita., gdpPercap), ~ !is.na(.))) %>%
    ggplot(aes(x = log(gdpPercap), y = log(CO2.emissions..metric.tons.per.capita.), text=paste0('</br>log(gdpPercap): ', gdpPercap, '</br>log(CO2 emissions (metric tons per capita)): ', CO2.emissions..metric.tons.per.capita., '</br>population: ', pop, '</br>continent: ', continent, '</br>Country Name: ', Country.Name))) +
    geom_point(aes(size = pop, color = continent)) +
    ylab("log(CO2 emissions (metric tons per capita))")
  
ggplotly(viz, tooltip = c("text"))

6) What is the relationship between continent and ‘Energy use (kg of oil equivalent per capita)’?

First we must choose a suitable statistical test. We have continuous data and want to check for differences in mean for more than two groups. 

Therefore, we have to check if the parametric assumptions are satisfied. For ANOVA, we have four parametric assumptions that must be met:

6.1) Samples must be independent

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.))) %>%
  ggplot(aes(x=continent, y=Energy.use..kg.of.oil.equivalent.per.capita., fill = continent)) +
  geom_boxplot() +
  ylab("Energy use (kg of oil equivalent per capita)")

First, exploratory data analysis (EDA) was performed on the data. We can say that the samples are independent.

6.2) Population variances must be equal (Levene’s test)

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.))) %>%
  # perform Levene’s test
  do(tidy(leveneTest(.$Energy.use..kg.of.oil.equivalent.per.capita.~.$continent, data = .)))
## Warning in leveneTest.default(y = y, group = group, ...): group coerced to
## factor.
## # A tibble: 1 x 4
##   statistic  p.value    df df.residual
##       <dbl>    <dbl> <int>       <int>
## 1      12.4 8.00e-10     4         843

We reject the H0 hypothesis that the variances are equal based on the p value of our Levene’s test. Therefore, we can not use an ANOVA test and must use a Kruskal-Wallis test instead.

6.3) Kruskal-Wallis test

dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.))) %>%
  # perform Kruskal-Wallis test
  do(tidy(kruskal.test(.$Energy.use..kg.of.oil.equivalent.per.capita.~.$continent, data = .)))
## # A tibble: 1 x 4
##   statistic  p.value parameter method                      
##       <dbl>    <dbl>     <int> <chr>                       
## 1      319. 1.01e-67         4 Kruskal-Wallis rank sum test

The resulting p value of our Kruskal-Wallis test shows that there is a significant difference in the mean for our data. Because of our significant result, we additionally perform a dunn’s test.

dat6 <- dat %>%
  filter(continent != "") %>%
  # remove rows which contain NAs
  filter(across(Energy.use..kg.of.oil.equivalent.per.capita., ~ !is.na(.)))

# perform dunn's test
dunn.test(dat6$Energy.use..kg.of.oil.equivalent.per.capita., g=dat6$continent)
##   Kruskal-Wallis rank sum test
## 
## data: x and group
## Kruskal-Wallis chi-squared = 318.6763, df = 4, p-value = 0
## 
## 
##                            Comparison of x by group                            
##                                 (No adjustment)                                
## Col Mean-|
## Row Mean |     Africa   Americas       Asia     Europe
## ---------+--------------------------------------------
## Americas |  -6.174961
##          |    0.0000*
##          |
##     Asia |  -4.852592   1.278883
##          |    0.0000*     0.1005
##          |
##   Europe |  -16.43361  -9.630909  -10.95868
##          |    0.0000*    0.0000*    0.0000*
##          |
##  Oceania |  -8.148201  -5.456302  -6.014711  -1.543152
##          |    0.0000*    0.0000*    0.0000*     0.0614
## 
## alpha = 0.05
## Reject Ho if p <= alpha/2

7) Is there a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990?

Again, we must choose a suitable test first. We have continuous data and want to check for differences in mean for two groups. 

Therefore, we have to check if the parametric assumptions are satisfied. For Student’s unpaired t-test, we have four parametric assumptions that must be met:

7.1) The observations are sampled independently

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter(across(Imports.of.goods.and.services....of.GDP., ~ !is.na(.))) %>%
  ggplot(aes(x=continent, y=Imports.of.goods.and.services....of.GDP., fill = continent)) +
  geom_boxplot() +
  ylab("Imports of goods and services (% of GDP)")

First, exploratory data analysis (EDA) was performed on the data. We can say that the samples are independent. We can also say that the dependent variable is measured on the incremental level year and that independent variables consist of matched pairs.

7.2) The dependent variable is normally distributed (Shapiro-Wilk normality test)

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter(across(Imports.of.goods.and.services....of.GDP., ~ !is.na(.))) %>%
  # perform Shapiro-Wilk normality test
  do(tidy(shapiro.test(.$Imports.of.goods.and.services....of.GDP.)))
## # A tibble: 1 x 3
##   statistic  p.value method                     
##       <dbl>    <dbl> <chr>                      
## 1     0.849 1.46e-13 Shapiro-Wilk normality test

We reject the H0 hypothesis that the data is normally distributed based on the p values of our Shapiro-Wilk normality test. Therefore, we can not use a Student’s unpaired t-test and must use a Wilcoxon Rank sums test instead.

7.3) Wilcoxon Rank sums test

dat %>%
  filter(Year > 1990 & (continent == "Europe" | continent == "Asia")) %>%
  # remove rows which contain NAs
  filter(across(Imports.of.goods.and.services....of.GDP., ~ !is.na(.))) %>%
  # perform Wilcoxon Rank sums test
  do(tidy(wilcox.test(.$Imports.of.goods.and.services....of.GDP.~.$continent)))
## # A tibble: 1 x 4
##   statistic p.value method                                           alternative
##       <dbl>   <dbl> <chr>                                            <chr>      
## 1      5707   0.787 Wilcoxon rank sum test with continuity correcti~ two.sided

We reject the H0 hypothesis that there is a significant difference between Europe and Asia with respect to ‘Imports of goods and services (% of GDP)’ in the years after 1990 based on the p values of our Wilcoxon Rank sums test.

8) What is the country (or countries) that has the highest ‘Population density (people per sq. km of land area)’ across all years? (i.e., which country has the highest average ranking in this category across each time point in the dataset?)

First, exploratory data analysis (EDA) was performed on the data.

viz <- dat %>%
  # remove rows which contain NAs
  filter(across(c(Year, Country.Name, Population.density..people.per.sq..km.of.land.area.), ~ !is.na(.))) %>%
  ggplot(aes(x=Year, y=Population.density..people.per.sq..km.of.land.area., group = Country.Name)) + 
  geom_line() + 
  ylab("Population density (people per sq. km of land area)")

ggplotly(viz)
highest.number <- dat %>%
  # remove rows which contain NAs
  filter(across(c(Year, Country.Name, Population.density..people.per.sq..km.of.land.area.), ~ !is.na(.))) %>%
  # split origial tibble in lists depending on the year
  split(.$Year) %>%
  # get maximum of variable in every year
  map(~max(.$Population.density..people.per.sq..km.of.land.area.)) %>%
  # merge list of lists into one list
  unlist

# map highest scoring variable from every year to respective country
highest.ranked.countries <- data.frame(lapply(highest.number, function(x) dat$Country.Name[which(dat$Population.density..people.per.sq..km.of.land.area. == x)]), stringsAsFactors=FALSE)

# count occurences of highest scoring countries
apply(highest.ranked.countries, MARGIN=1, table)
##                  [,1]
## Macao SAR, China    5
## Monaco              5

The countries that have the highest ‘Population density (people per sq. km of land area)’ across all years are Macao SAR and Monaco.

9) What country (or countries) has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962?

First, exploratory data analysis (EDA) was performed on the data.

viz <- dat %>%
  # remove rows which contain NAs
  filter(across(c(Year, Country.Name, Life.expectancy.at.birth..total..years.), ~ !is.na(.))) %>%
  ggplot(aes(x=Year, y=Life.expectancy.at.birth..total..years., group = Country.Name)) + 
  geom_line() + 
  ylab("Life expectancy at birth, total (years)")

ggplotly(viz)

To get the difference for Life expectancy at birth, total (years) for every country between 1962 and 2007 we subtract the value from 2007 minus the value from 1962.

dat %>%
  filter(Year == 1962 | Year == 2007) %>%
  # remove rows which contain NAs
  filter(across(c(Year, Country.Name, Life.expectancy.at.birth..total..years.), ~ !is.na(.))) %>%
  group_by(Country.Name) %>%
  # Calculate difference between 1962 and 2007
  mutate(diff = Life.expectancy.at.birth..total..years. - lag(Life.expectancy.at.birth..total..years., default = 0)) %>% 
  select(Country.Name, diff) %>%
  # keep only rows with calculated difference
  filter(row_number() %% 2 == 0) %>%
  # sort by difference
  arrange(desc(diff))
## # A tibble: 236 x 2
## # Groups:   Country.Name [236]
##    Country.Name        diff
##    <chr>              <dbl>
##  1 Maldives            36.9
##  2 Bhutan              33.2
##  3 Timor-Leste         31.1
##  4 Tunisia             30.9
##  5 Oman                30.8
##  6 Nepal               30.6
##  7 China               29.9
##  8 Yemen, Rep.         27.2
##  9 Saudi Arabia        26.7
## 10 Iran, Islamic Rep.  26.6
## # ... with 226 more rows

Country Maldives has shown the greatest increase in ‘Life expectancy at birth, total (years)’ since 1962.